#!/usr/bin/perl
#Name    	: 3_Preliminary_validation_SS.pl
#Author  	: Morgan, Matthew
#Created 	: 04/2011
#Modified	: 02/2012
#Purpose	: Preliminary Validation of filtered unique sequences within groups for single samples
#Syntax		: perl 3_Preliminary_validation_SS.pl [input txt file] [col for cluster data] [col for total read data] [col with first MID read data] [number of consecutive MID columns]
#Further info	: Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.

#########################################################################################################################################################	
#																			#
#CSIRO Open Source Software License Agreement (GPLv3)													#
#																			#
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.					#
#																			#
#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3				#
# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material.	#
#The following additional terms apply under clause 7 of that license:											#
#																			#
#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS		#
#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS,	#
#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,		#
#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.				#
#																			#
#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT		#
#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.		#
#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS,		#
#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS;		#
#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE		#
#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF		#
#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.														#
#																			#
#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS		#
#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR			#
#MODIFIED "CONSUMER GUARANTEES".  IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL		#
#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION.  WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF		#
#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM:							#
#1.          THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;					#
#2.          THE REPAIR OF THE SOFTWARE; 														#
#3.          THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN,		#
#	     OR HAVING THE SOFTWARE REPAIRED.														#
#																			#
#########################################################################################################################################################

use warnings;
use strict;

my (@hashes, %hashes, $read, $firstmid, $nmids, @parts, @slice, $cluster, $clusters, @clusters, %genbank, $clustername, $seq, $hit, $bitscore, $nreads);
my ($input, %hithash, @totalkeys, @allkeys, @tophits, %midcount);

$input = $ARGV[0];
$clusters = $ARGV[1]; # col number for clusters
$read = $ARGV[2];     # col number for total reads
$firstmid = $ARGV[3]; # col number for first mid
$nmids = $ARGV[4];    # number of mids

open INFILE, "<$input"||die("Can't open infile");
open OUTFILE,  ">Top_group_Preliminary_Validated_sequences.txt";
open OUTFILE1, ">Additional_group_validated_seqs.txt";
open OUTFILE2, ">Preliminary_valid_sequences_by_group.txt";
open OUTFILE3, ">Preliminary_valid_sequence_names.txt";
open( REJ,  ">Top_group_rejected_rare_sequences.txt" );
while (<INFILE>)
	{
		if ($_ =~ m/^name/)
			{ 
			chomp $_;
			@parts = split(/\t/,$_);
			my $totalreads = $read - 1;
			my $midstart = $firstmid - 1;
			my $midend = $firstmid + $nmids - 2;
			@slice = @parts[$totalreads, $midstart..$midend];
			print "@slice\n";
			}	
		else
			{
			chomp $_;
			my @tmp = split(/\t/,$_);
			$seq = $tmp[0];
			$clustername = $tmp[$clusters - 1];
				if (defined($genbank{$clustername}))
                                	{
                                	$genbank{$clustername} ++;
                                	}
                        	else
                                	{
                                	$genbank{$clustername} = 1;
                                	}
                	}
	}

close (INFILE);

print "Set read proportion cut-off (default = 0.50) : ";
my $prop = <STDIN>;
chomp($prop);
if ( !$prop ) {
    $prop = 0.50;
}
print "Cut-off set to $prop of most abundant sequence\n";

@clusters = sort keys %genbank;
foreach my $cluster (@clusters)
	{
	print "$cluster\n";
	%hithash = ();
	my @hashes;
	foreach my $mid (@slice)
		{
		push @hashes, {}; 
		}
	open INFILE, "<$input"||die("Can't open infile");
	while (<INFILE>)
		{
		unless ( $_ =~ m/^name|^\s+/ ) {
			chomp $_;
                	my @tmp = split(/\t/,$_);
                	$seq = $tmp[0];
                	$hit = $tmp[$clusters - 1];
			if ( $hit eq $cluster )
				{ 
				$hashes[0]{$seq} = $tmp[$read - 1];
				for (my $i=1; $i<=$nmids ; $i++)
					{
					$hashes[$i]{$seq} = $tmp[$i + $firstmid - 2 ];
					if ($tmp[$i + $firstmid - 2 ] > 0) { 
						if (defined($midcount{$seq})){
							$midcount{$seq}++;
						}
						else{
							$midcount{$seq}=1;
						}
					} 

					}
				}
			}
		}

close (INFILE);
	my $elem_count=0;
	my $best;
	foreach my $elem (@hashes){ 
	  $elem_count++;
	  my %subhash = %$elem;
          my @sorted_keys = ();
          my @sorted_reads = ();
          my $tophit = ();
          my $numreads = ();
          my @tophits = ();
	  if ( keys %subhash ) { 
	    @sorted_keys = sort {$subhash{$b} <=> $subhash{$a}} keys %subhash;
	    @sorted_reads = @subhash{@sorted_keys};
	    $tophit = $sorted_keys[0];
            $numreads = $sorted_reads[0];
	      if ($elem_count == 1) {
		$best = $tophit;
		if ($numreads >= 2){
		  print "$best\n";
		  @tophits = grep { $subhash{$_} == $numreads } keys %subhash;
                  foreach my $key (@tophits){
		       	print OUTFILE "$key\t$cluster\t$numreads\t$elem_count\n";
                    	if (defined($hithash{$key})){$hithash{$key} ++;}
                    	else{$hithash{$key} = 1;}
		  }
                }
	      }
	      elsif ($elem_count > 1) {
		my $score = $subhash{$best};
		my $cutoff = $score * $prop;
		if ($cutoff < 1){$cutoff = 1;}
		if ($numreads > 1){ 
		  @tophits = grep { $subhash{$_} > $cutoff } keys %subhash;
                  foreach my $key (@tophits) {
		    if (($score<=10) && ($subhash{$key}<=10)){
		      unless ($key eq $best){
			print "$key won but with too few reads\n\n";
			print REJ "$key\t$elem_count\t$subhash{$key}\t$cluster\t$score\n";
		      }
		    }
		    else { 			
		      unless ($key eq $best){
			print OUTFILE1 "$key\t$cluster\t$numreads\t$elem_count\n";
			if (defined($hithash{$key})){$hithash{$key} ++;}
			else{ $hithash{$key} = 1;}
		      }
		    }
		  }#close foreach
                 }#close if
               }#close elsif	
	     }#close if keys subhash
	  }#close foreach my elem
	  my @totalkeys = keys %hithash;
          my $len = scalar(@totalkeys);
          print OUTFILE2 "$cluster\t$len\t@totalkeys\n";
          for (@totalkeys){
	    push @allkeys, $_;
	  }	
}#close foreach cluster

print OUTFILE3 "name\n";
for (@allkeys){
  print OUTFILE3 "$_\n";
}

close (OUTFILE);
close (OUTFILE1);
close (OUTFILE2);
close (OUTFILE3);

my %stor;
open OUT,   ">Preliminary_validated_sequences.txt";
open FILE2, "<$input";
while (<FILE2>) {
    if ( $_ =~ m/^name\s+/ ) {
	chomp;        
	print OUT "name\t$_\n";
    }
    else {
        s/\r?\n//;
        my @temp2 = split( /\t/, $_ );
        $stor{ $temp2[0] } = $_;
    }
}
open FILE1, "<Preliminary_valid_sequence_names.txt";
while (<FILE1>) {
    s/\r?\n//;
    my @temp1 = split /\t/, $_;
    my $cand = $stor{ $temp1[0] };
    if ($cand) {
        $cand =~ s/^/$_\t/gm;
        print OUT "$cand\n";
    }
}
close (OUT);
print "\n";
print time - $^T . "seconds taken\n";
print "Done.\n";
